library(plotly)
library(DataExplorer)
library(esquisse)
library(data.table)
library(psych)
library(MASS)
library(klaR)
library(tidyverse)
library(magrittr)
library(readr)
library(NbClust)
library(ggmap)
library(maps)
library(htmlwidgets)
# Read one Michelin CSV export and tag every row with its star rating.
#
# path:    path of the CSV file to read.
# n_stars: value stored in the new `stars` column for every row.
#
# latitude, longitude and zipCode are declared as character and year as
# integer so readr does not guess their types. Returns the tibble produced by
# read_csv() with the additional `stars` column.
read_michelin_stars <- function(path, n_stars) {
  read_csv(
    path,
    col_types = cols(
      latitude = col_character(),
      longitude = col_character(),
      year = col_integer(),
      zipCode = col_character()
    )
  ) %>%
    # `!!` inlines the argument value, so a CSV column that happened to be
    # called `n_stars` could never shadow it.
    mutate(stars = !!n_stars)
}

# One table per star level; the file names follow the downloaded exports.
one_star_michelin_restaurants <- read_michelin_stars(
  "one-star-michelin-restaurants.csv", 1
)
two_star_michelin_restaurants <- read_michelin_stars(
  "two-stars-michelin-restaurants.csv", 2
)
three_star_michelin_restaurants <- read_michelin_stars(
  "three-stars-michelin-restaurants.csv", 3
)
# Stack the one-, two- and three-star tables into a single tibble, turn the
# categorical columns into factors and parse the coordinates as numbers.
# `price` becomes a factor of the character count of the original price
# string.
michelin <- bind_rows(
  one_star_michelin_restaurants,
  two_star_michelin_restaurants,
  three_star_michelin_restaurants
) %>%
  mutate(
    city = factor(city),
    region = factor(region),
    zipCode = factor(zipCode),
    cuisine = factor(cuisine),
    price = factor(str_length(price)),
    stars = factor(stars),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude),
    # Simplified cuisine listing: restaurants describe their food with labels
    # such as "innovative" or "classic cuisine", so the raw labels are folded
    # into a smaller set of groups.
    cuisine.collapsed = fct_collapse(
      cuisine,
      American = c("American", "Californian"),
      Australian = c("Australian"),
      Chinese = c(
        "Cantonese", "Hang Zhou", "Sichuan-Huai Yang",
        "Cantonese Roast Meats", "Fujian", "Hunanese and Sichuan",
        "Chinese", "Shanghainese", "Taiwanese", "Dim Sum", "Sichuan",
        "Taizhou", "Noodles and congee"
      ),
      French = c(
        "Classic French", "French contemporary", "Creative French",
        "French", "Modern French"
      ),
      British = c("Creative British", "Traditional British", "Modern British"),
      European = c(
        "European", "Austrian", "European contemporary", "Danish", "Finnish"
      ),
      Meats = c("Meats and grills", "Barbecue"),
      Modern = c(
        "modern", "Contemporary", "creative", "Fusion", "Modern cuisine",
        "Creative", "Gastropub", "Innovative"
      ),
      Other = c(
        "International", "Street Food", "Temple cuisine", "Seafood",
        "Vegetarian", "Classic cuisine", "Market cuisine",
        "Regional cuisine", "Steakhouse"
      ),
      Japanese = c("Japanese contemporary", "Sushi", "Teppanyaki", "Japanese"),
      Moroccan = c("Moroccan"),
      Scandinavian = c("Scandinavian"),
      Asian = c("Asian", "Asian contemporary", "Asian influences"),
      Italian = c("Italian", "Italian contemporary"),
      Korean = c("Korean", "Korean contemporary"),
      Mediterranean = c("Mediterranean", "Mediterranean cuisine"),
      Thai = c("Southern Thai", "Thai", "Thai Contemporary"),
      Indian = c("Indian"),
      Malaysian = c("Peranakan"),
      Spanish = c("Spanish"),
      Mexican = c("Mexican")
    )
  )

# Show the first rows of the combined table.
michelin %>% head()
## # A tibble: 6 x 12
## name year latitude longitude city region zipCode cuisine price url stars
## <chr> <int> <dbl> <dbl> <fct> <fct> <fct> <fct> <fct> <chr> <fct>
## 1 Kili… 2019 47.3 10.2 Klei… Austr… 87568 Creati… 5 http… 1
## 2 Pfef… 2019 47.8 13.1 Hall… Austr… 5300 Classi… 5 http… 1
## 3 Essz… 2019 47.8 13.0 Salz… Austr… 5020 Creati… 5 http… 1
## 4 Carp… 2019 47.8 13.0 Salz… Austr… 5020 Market… 5 http… 1
## 5 Edva… 2019 48.2 16.4 Wien Austr… 1010 Modern… 4 http… 1
## 6 Das … 2019 48.2 16.4 Wien Austr… 1020 Modern… 5 http… 1
## # … with 1 more variable: cuisine.collapsed <fct>
# Numeric clustering input: one row per restaurant, holding only the three
# features that are clustered on.
michelin.kmodes <- michelin %>%
  dplyr::select(cuisine.collapsed, price, stars) %>%
  mutate(
    # Cuisine has no numeric meaning; it is stored as its level position so it
    # can be mapped back to a name through levels(michelin$cuisine.collapsed).
    cuisine.collapsed = as.numeric(cuisine.collapsed),
    # as.numeric() on a factor returns level positions, not the labels. Going
    # through as.character() recovers the real price length / star count even
    # if some value in the range never occurs in the data.
    price = as.numeric(as.character(price)),
    stars = as.numeric(as.character(stars))
  )

# Ask NbClust to compare partitions of 2 to 20 clusters, using Euclidean
# distance and Ward linkage ("ward.D"), and report the count most of its
# indices agree on.
# NOTE(review): Euclidean distance treats the cuisine codes as ordered
# quantities although they are arbitrary labels -- confirm this is acceptable
# for choosing the number of clusters.
NbClust(
  michelin.kmodes,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 20,
  method = "ward.D"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 7 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 17 as the best number of clusters
## * 3 proposed 18 as the best number of clusters
## * 2 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 1.3679 731.7359 464.9049 40.7610 2678.303 1741830711 4705895.93 4530.4576
## 3 1.4050 842.5484 350.4133 27.2827 3283.952 1639561583 1754556.80 2711.4551
## 4 59.3874 961.5437 107.5604 30.2000 3717.357 1562346902 631079.75 1799.9837
## 5 0.0183 859.0576 312.7938 27.6781 3976.057 1682441369 416556.72 1557.5387
## 6 6.2031 1059.8120 111.6136 32.8812 4450.720 1223750354 133819.39 1071.7075
## 7 1.8378 1043.3315 84.4441 32.6902 4731.470 1112115971 122605.87 922.3007
## 8 1.7926 1014.6338 67.5305 32.1847 4939.855 1076261980 96703.74 821.4742
## 9 0.3089 982.0834 88.4125 31.5522 5328.530 778656832 86043.23 747.9522
## 10 2.0641 993.8443 63.9171 32.0367 5505.345 745370017 56754.19 662.5606
## 11 4.4886 982.8763 45.0739 31.9403 5650.046 732378809 43815.08 606.0137
## 12 0.2506 955.1042 21.0692 31.3949 5765.181 738527889 36554.08 568.5478
## 13 0.5327 902.9517 66.4582 30.1462 5811.845 810460244 33774.98 551.5341
## 14 3.0261 918.4780 21.7103 30.7582 5966.103 752848661 27343.70 502.5615
## 15 0.2189 880.3182 80.2699 29.8517 6022.490 796890642 25438.09 487.0348
## 16 2.4199 922.6114 49.3633 31.2288 6222.561 679884218 20923.24 435.6133
## 17 0.3103 929.5441 102.2960 31.5981 6341.840 646482976 18610.24 406.0905
## 18 33.1907 1011.3873 27.1355 33.9503 6627.715 480358586 14040.13 352.8524
## 19 1.4961 993.5232 21.9950 33.6677 6724.225 465822219 13109.87 339.2545
## 20 1.8611 971.5756 21.5028 33.2661 6775.070 479734222 11907.97 328.5640
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 36.5514 17.2660 0.2434 0.8208 0.5656 0.4554 661.3258 2.0322 0.2250
## 3 60.2710 28.8491 0.1890 0.8305 0.5224 0.2343 450.9394 5.5229 0.2472
## 4 92.4656 43.4576 0.2315 0.6902 0.5399 0.7079 177.8703 0.7009 0.2360
## 5 100.5557 50.2222 0.2066 0.9837 0.3780 0.2970 283.9735 3.9954 0.2803
## 6 165.8452 72.9891 0.1868 0.8941 0.4042 0.6415 131.8936 0.9474 0.2643
## 7 205.6688 84.8129 0.1609 0.9373 0.4295 0.4094 278.3839 2.4429 0.2633
## 8 267.8373 95.2227 0.1499 0.9172 0.4321 0.5592 123.7745 1.3336 0.2470
## 9 277.6087 104.5829 0.1371 0.8929 0.4687 0.6505 60.7237 0.9068 0.2627
## 10 315.6959 118.0617 0.1605 1.0036 0.4540 0.6802 42.7850 0.7917 0.2544
## 11 327.8350 129.0779 0.1444 1.0453 0.4557 0.5741 45.9885 1.2427 0.2490
## 12 375.8978 137.5839 0.1385 1.0096 0.4784 0.0000 Inf Inf 0.2389
## 13 378.0800 141.8280 0.1367 0.9511 0.5025 0.5130 73.1014 1.5955 0.2311
## 14 386.6055 155.6486 0.1244 0.9249 0.5275 0.0000 Inf Inf 0.2273
## 15 390.1135 160.6107 0.1195 0.8833 0.5618 0.5147 46.1989 1.5730 0.2209
## 16 483.6394 179.5698 0.1106 0.8767 0.5698 0.6128 53.0787 1.0631 0.2152
## 17 495.9660 192.6246 0.0952 0.9081 0.5960 0.4691 46.4012 1.8808 0.2108
## 18 713.3914 221.6876 0.0998 0.9090 0.5959 0.7681 21.1397 0.5069 0.2052
## 19 730.0732 230.5732 0.0868 0.9202 0.5916 0.2529 118.1579 4.9062 0.2020
## 20 734.4826 238.0754 0.0833 0.9023 0.5936 0.6567 24.5654 0.8713 0.1976
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 2265.2288 0.5763 0.9034 0.1984 0.0864 2e-04 2.8000 2.0929 0.7684
## 3 903.8184 0.6282 -0.1457 0.3799 0.0990 2e-04 2.2978 1.6224 0.4529
## 4 449.9959 0.6502 5.3030 0.3701 0.1302 2e-04 1.3926 1.3665 0.1947
## 5 311.5077 0.4575 0.0035 0.9424 0.1302 3e-04 2.6081 1.2529 0.2209
## 6 178.6179 0.4711 0.8973 0.9046 0.1302 3e-04 2.4546 1.0690 0.0813
## 7 131.7572 0.4343 1.5263 1.0996 0.1302 3e-04 2.6312 0.9758 0.0764
## 8 102.6843 0.3961 0.9056 1.3422 0.1302 3e-04 2.9788 0.8826 0.0875
## 9 83.1058 0.3681 0.2548 1.5477 0.1302 3e-04 3.2281 0.8217 0.0713
## 10 66.2561 0.3619 0.2639 1.5371 0.1741 3e-04 3.1527 0.7848 0.0660
## 11 55.0922 0.3564 0.3449 1.5191 0.1741 3e-04 3.1311 0.7519 0.0626
## 12 47.3790 0.3529 1.8785 1.5209 0.1741 3e-04 4.2546 0.7043 0.0572
## 13 42.4257 0.3443 0.2471 1.5945 0.1741 3e-04 4.4482 0.6553 0.0511
## 14 35.8972 0.3395 1.1192 1.5628 0.1741 3e-04 4.5615 0.6265 0.0468
## 15 32.4690 0.3251 0.0936 1.6770 0.1741 3e-04 4.7189 0.5818 0.0428
## 16 27.2258 0.3250 0.2425 1.6015 0.1741 3e-04 4.6139 0.5537 0.0414
## 17 23.8877 0.3176 0.0595 1.5257 0.1741 3e-04 4.7630 0.5320 0.0402
## 18 19.6029 0.3182 0.2710 1.4259 0.2000 3e-04 4.5073 0.5061 0.0363
## 19 17.8555 0.3110 0.3641 1.3509 0.2000 3e-04 5.1115 0.4799 0.0355
## 20 16.4282 0.3081 0.2782 1.3360 0.2000 3e-04 5.0785 0.4547 0.0334
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6930 244.9257 0.1074
## 3 0.5991 92.3269 0.0010
## 4 0.6805 202.3331 0.5515
## 5 0.5857 84.8793 0.0081
## 6 0.6431 130.9672 0.4172
## 7 0.6280 114.3488 0.0632
## 8 0.6108 100.0491 0.2627
## 9 0.5797 81.9449 0.4379
## 10 0.5563 72.5697 0.4994
## 11 0.5088 59.8593 0.2956
## 12 0.5265 63.8404 0.0000
## 13 0.5367 66.4787 0.1912
## 14 0.5928 88.6202 0.0000
## 15 0.4752 54.1058 0.1984
## 16 0.5471 69.5362 0.3653
## 17 0.4474 50.6396 0.1363
## 18 0.5247 63.3993 0.6779
## 19 0.4434 50.2168 0.0030
## 20 0.4689 53.2292 0.4577
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 4.0000 6.000 4.0000 2.000 3.0000 6 3
## Value_Index 59.3874 1059.812 242.8529 40.761 605.6494 347056632 2951339
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.000 18.0000 18.0000 20.0000 4.0000 17.000 4.0000
## Value_Index 907.531 217.4254 -20.1773 0.0833 0.6902 0.596 0.7079
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain Dunn
## Number_clusters 4.0000 2.0000 5.0000 3.00 4.0000 1 2.0000 18.0
## Value_Index 177.8703 2.0322 0.2803 1361.41 0.6502 NA 0.1984 0.2
## Hubert SDindex Dindex SDbw
## Number_clusters 0 4.0000 0 20.0000
## Value_Index 0 1.3926 0 0.0334
##
## $Best.partition
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 2 3 3 2 1 1 3 1 2 2 1 1 4 1 1 3 1
## [38] 2 1 3 3 3 1 3 2 3 1 1 3 2 1 1 2 1 1 3 3 1 1 3 2 3 1 3 1 2 2 1 2 1 1 3 3 2
## [75] 2 2 3 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 3 1 4 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 2 1 1 3 3 3 3 3 3 3 3 2 3 1
## [149] 3 3 2 1 1 3 3 3 3 3 1 3 3 3 3 1 3 1 1 1 3 3 3 3 2 3 3 1 1 2 1 1 1 1 1 1 1
## [186] 1 1 3 3 3 3 2 1 3 1 3 3 1 1 1 2 1 4 1 2 1 1 2 3 2 2 2 1 2 3 1 2 2 1 2 2 1
## [223] 3 1 2 1 1 4 2 1 1 2 4 2 1 2 3 2 1 2 2 2 4 1 1 2 2 2 1 2 2 2 2 2 2 1 2 3 3
## [260] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 3 2 2 1 1 2 2 2 1 2 2 2 1 2 2 1 1 1 2
## [297] 1 1 1 3 2 1 2 1 3 1 3 2 3 3 3 1 3 2 1 3 3 1 1 1 3 1 3 3 1 2 2 3 1 3 1 2 1
## [334] 3 3 1 3 3 1 1 1 2 1 1 1 1 1 1 1 2 1 3 1 1 1 1 1 1 3 1 1 2 3 3 3 2 3 3 2 2
## [371] 3 1 1 3 3 1 4 1 4 4 1 4 4 1 4 2 4 1 4 1 4 1 4 1 4 4 4 1 3 1 2 2 3 1 1 3 1
## [408] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 1 3 1 1 1 3 1 1 1 1 4 1 3 1 1 1 1 1 4
## [519] 1 1 1 1 1 4 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 2 1 3 2 1 2 2 2 1 3 1 1 1 1 1 1 3 1 1 1 1 3 3 3 2 3 2 1 3 1 1 1 2 1 3
## [593] 3 2 1 1 1 2 1 1 3 1 2 2 1 1 4 1 4 1 1 1 1 1 2 2 2 2 1 2 1 2 3 1 1 1 1 1 1
## [630] 1 2 1 2 3 1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 1 1 1
## [667] 1 1 1 3 1 3 1 2 1 2 3 1 3 1 2 1 3 1 1 2 2 1 3 3 1 1 1 2 1
# kmodes() picks its starting modes at random when it is given a cluster
# count, so the seed is fixed to make the partition (and everything derived
# from it below) repeatable from run to run. The seed value itself is
# arbitrary.
set.seed(42)

# Partition the restaurants into 4 clusters with k-modes.
kmode.4 <- kmodes(
  michelin.kmodes,
  modes = 4,
  iter.max = 10,
  weighted = FALSE
)

# Human-readable description of each cluster mode: the cuisine code is mapped
# back to its level name and the price count is rendered as "$" characters.
cluster.info <- kmode.4$modes %>%
  mutate(
    cuisine.collapsed = levels(michelin$cuisine.collapsed)[cuisine.collapsed],
    price = strrep("$", price)
  )

# Record each restaurant's cluster assignment on the main table.
michelin$cluster <- kmode.4$cluster
cluster.info
## cuisine.collapsed price stars
## 1 Chinese $$$ 1
## 2 Modern $$$ 1
## 3 Japanese $$$$ 1
## 4 Modern $$$$ 1
# Base-graphics plot of the three clustering features, coloured by cluster.
plot(michelin.kmodes, col = kmode.4$cluster)

# Cluster ids are labels, not quantities: keep them as a factor so ggplot
# assigns one discrete colour per cluster instead of a continuous gradient.
kmodes.clustered <- michelin.kmodes %>%
  mutate(cluster = factor(kmode.4$cluster))

# Each pair of features is drawn with a single jittered point layer. The
# features take few distinct values, so jitter spreads overlapping points;
# adding an un-jittered geom_point() as well would draw every restaurant twice.
ggplot(kmodes.clustered, aes(x = stars, y = price, color = cluster)) +
  geom_jitter(size = 2)

ggplot(kmodes.clustered, aes(x = cuisine.collapsed, y = price, color = cluster)) +
  geom_jitter(size = 2)

ggplot(kmodes.clustered, aes(x = cuisine.collapsed, y = stars, color = cluster)) +
  geom_jitter(size = 2)
# Plotly Plots

# Hover label for each restaurant: name, price as "$" signs, star count,
# collapsed cuisine and city.
hover.3d <- paste(
  michelin$name, ":",
  "A", strrep("$", michelin$price),
  michelin$stars, "Michelin star",
  michelin$cuisine.collapsed, "restaurant",
  "in", michelin$city
)

# Clustering features together with the cluster each row was assigned to.
kmodes.with.cluster <- mutate(michelin.kmodes, cluster = kmode.4$cluster)

# Interactive 3D scatter of the three features, coloured by cluster.
michelin.3d <- plot_ly(
  kmodes.with.cluster,
  x = kmodes.with.cluster$cuisine.collapsed,
  y = kmodes.with.cluster$price,
  z = kmodes.with.cluster$stars,
  type = "scatter3d",
  mode = "markers",
  color = kmodes.with.cluster$cluster,
  showlegend = FALSE,
  hoverinfo = "text",
  hovertext = hover.3d
)
michelin.3d

# Write the widget to an HTML file.
saveWidget(michelin.3d, "michelin_3d.html")
# The Mapbox access token is taken from the MAPBOX_TOKEN environment variable
# (set it outside the script, e.g. in ~/.Renviron) instead of being written
# into the source, so no credential is committed with the code. Fail early
# with an actionable message when it is missing.
# NOTE(review): a token used to be hard-coded on this line; treat it as leaked
# and rotate it in the Mapbox account.
if (!nzchar(Sys.getenv("MAPBOX_TOKEN"))) {
  stop(
    "MAPBOX_TOKEN is not set. Define it in the environment (e.g. in ",
    "~/.Renviron) before building the Mapbox map.",
    call. = FALSE
  )
}
# Hover label for each restaurant: name, price as "$" signs, star count,
# collapsed cuisine and city.
hover.map <- paste(
  michelin$name, ":",
  "A", strrep("$", michelin$price),
  michelin$stars, "Michelin star",
  michelin$cuisine.collapsed, "restaurant",
  "in", michelin$city
)

# Marker size: the price code raised to the fifth power.
marker.size <- as.numeric(michelin$price)^5

# Mapbox map with one marker per restaurant, placed at its coordinates and
# coloured by star rating.
michelin.map <- plot_mapbox(maps::world.cities) %>%
  add_markers(
    x = michelin$longitude,
    y = michelin$latitude,
    size = marker.size,
    color = michelin$stars,
    hoverinfo = "text",
    hovertext = hover.map
  )
michelin.map

# Write the widget to an HTML file.
saveWidget(michelin.map, "michelin_map.html")